# -*- encoding: utf-8 -*-
# author:lmolhw
# datetime:2020-4-29 10:19

"""
File description:
    Process all of the THUC text data.
"""
import re
import os

def open_txt(file_name):
    """Lazily yield the lines of a UTF-8 text file, stripped of whitespace.

    Args:
        file_name: Path of the text file to read.

    Yields:
        Each line of the file with leading and trailing whitespace removed.
        Blank lines are yielded as empty strings.

    Raises:
        OSError: If the file cannot be opened (the open call is deliberately
            outside the ``try`` so a missing file is not silently ignored).
    """
    # Read-only mode: the file is never written, so 'r+' would only add a
    # needless requirement for write permission.
    with open(file_name, 'r', encoding='utf-8') as f:
        try:
            for line in f:
                yield line.strip()
        # Only read/decode failures are treated as "best effort". A bare
        # except here would also catch GeneratorExit (raised at the yield
        # when the consumer stops early) and KeyboardInterrupt.
        except (OSError, UnicodeDecodeError):
            print('No value')

# Directory whose immediate sub-directories are scanned for input files.
path = '/data2/text_eval/recognition/THUCNews_out'

# Every entry found inside each sub-directory of `path`, joined into a full
# path and stripped of surrounding whitespace.
path_list = [
    os.path.join(path, sub_dir, entry).strip()
    for sub_dir in os.listdir(path)
    for entry in os.listdir(os.path.join(path, sub_dir))
]

# Write every space-separated token longer than two characters to the lines
# file, and collect the distinct characters of those tokens for the dict file.
# A set is used so memory grows with the number of distinct characters rather
# than with the total number of tokens processed.
dict_chars = set()

# Both outputs are opened in a `with` block so they are flushed and closed
# even if reading one of the inputs raises.
with open('./THUCNews_lines.txt', 'w', encoding='utf-8') as out_file, \
        open('./THUCNews_dict.txt', 'w', encoding='utf-8') as dict_file:
    for file in path_list:
        print(file)
        for line in open_txt(file):
            for j in line.strip().split(' '):
                if len(j) > 2:
                    out_file.write(j.strip() + '\n')
                    dict_chars.update(j)

    # Sorted so the dictionary file is identical from run to run; plain set
    # iteration order for strings varies with hash randomization.
    for c in sorted(dict_chars):
        dict_file.write(c.strip() + '\n')
